library(data.table)
## data.table 1.14.9 IN DEVELOPMENT built 2023-02-17 18:32:02 UTC; root using 2 threads (see ?getDTthreads).  Latest news: r-datatable.com
## **********
## This development version of data.table was built more than 4 weeks ago. Please update: data.table::update_dev_pkg()
## **********
library(ggplot2)
library(plyr)
library(magrittr)
library(readr)
library(stringr)
library(CellBarcode)
library(ggrepel)
library(ROCR)
library(ggsci)

theme0 <- theme_bw() + theme(
    text = element_text(size = 15),
    line = element_line(size = 1),
    axis.line = element_line(size = 1),
    axis.ticks.length = unit(3, units = "mm"),
    axis.text.x = element_text(
        margin = margin(t = 2, unit = "mm")
        , angle = 60, vjust = 1, size = 15, hjust = 1),
    axis.text.y = element_text(margin = margin(r = 3, l = 5, unit = "mm")),
    legend.position = "right",
) 
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
theme0_lt = theme0 + theme(legend.position = "top")
theme1 = theme0 + theme(
    axis.text.x = element_text( 
        margin = margin(t = 2, unit = "mm")
        , angle = 0, vjust = 1, size = 12, hjust = 0.5)
)

knitr::opts_chunk$set(fig.width=20, fig.height=12) 


################################################################################
#                               Analysis target                                #
################################################################################

Sample info

sample_info = fread("../run_simulation_no_umi/non_umi_simu_design_matrix.tsv")
sample_info$simu_id %<>% as.character
setkey(sample_info, "simu_id")

i_level_simu_name = sample_info[order(as.integer(simu_id)), simu_name][1:26]

Clone size disbribution

true_barcode_l = read_rds("../tmp/non_umi_simulation_ref.rds")
true_barcode_l = read_rds("../run_preprocess_simulation/tmp/non_umi_simulation_ref.rds")
d_true_barcode = true_barcode_l %>% rbindlist(idcol = "simu_file") 
x = d_true_barcode$simu_file %>% str_match("barcode_simulation_\\d+_simu_seq_(.*)") %>% extract(, c(2))
table(x, useNA = "always")
## x
##     1    10    11    12    13    14    15    16    17    18    19     2    20    21    22    23    24    25    26    27    28     3     4     5     6     7     8     9  <NA> 
##  8999  8998  8999  8998  8999  8999  8998  9000  8997  8998  8863  4499  8854  8999  6861  6869  6855 11971 20058 17997 35978 17989 35981  8998  8999  9000  8999  8996     0
d_true_barcode = cbind(d_true_barcode, simu_id = x)
# d_true_barcode[simu_name == "hamming" & grepl("simulation_19_", simu_file), .N, by = barcode_seq][N > 1]
# d_true_barcode[simu_name == "vdj_barcode" & grepl("simulation_1_", simu_file), .N, by = barcode_seq][N > 1]
# d_true_barcode[simu_name == "vdj_barcode"]
d_true_barcode$simu_name = sample_info[d_true_barcode$simu_id, simu_name]
table(d_true_barcode$simu_name, useNA = "always")
## 
##                                barcode_length_10     barcode_size_mean_3_variation_1_clone_n_1200      barcode_size_mean_3_variation_1_clone_n_600                                             base                                     clone_n_1200 
##                                             8999                                            35978                                            17997                                             8999                                            35981 
##                                      clone_n_150                                      clone_n_600                                         cycle_20                                         cycle_40                                   efficiency_0.5 
##                                             4499                                            17989                                             8996                                             8998                                             8999 
##                                   efficiency_0.9                                       error_1e-5                                       error_1e-7                                          hamming                         hamming_size_variation_3 
##                                             8998                                             8999                                             8999                                             8863                                             8854 
##                                 ngs_profile_MSv1                                     pcr_read_100                                    pcr_read_1000                                      pcr_read_25                                    size_mean_0.6 
##                                             8998                                             9000                                             8997                                             8998                                             8998 
##                                      size_mean_3                                 size_variation_1                                 size_variation_3                                      vdj_barcode vdj_barcode_size_mean_3_variation_1_clone_n_1200 
##                                             8999                                             9000                                             8999                                             6861                                            20058 
##  vdj_barcode_size_mean_3_variation_1_clone_n_600                     vdj_barcode_size_variation_1                     vdj_barcode_size_variation_3                                             <NA> 
##                                            11971                                             6869                                             6855                                                0
d_true_barcode = rename(d_true_barcode, c("seq" = "barcode_seq"))

Barcode frequency

ggplot(d_true_barcode[, .(freq = sum(freq)), by = .(simu_file, barcode_seq, simu_name)]) + aes(x = simu_name, y = freq) +
    geom_boxplot() +
    # geom_jitter(alpha = 0.2) + 
    theme0 + scale_y_log10() +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 53975 rows containing missing values (`stat_boxplot()`).

AUC

# NOTE: input
d_auc = fread("./tmp/table_auc_no_umi_clustering.tsv")
d_count_true_barcode = fread("tmp/table_count_true_barcode_clustering.tsv")

count - true barcode

ggplot(d_count_true_barcode) + aes(x = simu_name, y = count, color = factor(is_true)) +
    geom_boxplot(alpha = 0.2) +
    theme0_lt + scale_y_log10() + scale_color_npg() +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 71827 rows containing missing values (`stat_boxplot()`).
## Warning: Removed 14543 rows containing non-finite values (`stat_boxplot()`).

AUC versus simu conditions

ggplot(d_auc) + aes(x = simu_name, y = auc) + geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name) +
    labs(y = "AUC") + lims(y = c(0, 1))
## Warning: Removed 60 rows containing missing values (`stat_boxplot()`).

ggplot(d_auc) + aes(x = simu_name, y = aucpr) + geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name) +
    labs(y = "P-R AUC") + lims(y = c(0, 1))
## Warning: Removed 60 rows containing missing values (`stat_boxplot()`).

Resolusion index

d_count_true_barcode = fread("tmp/table_count_true_barcode_clustering.tsv")

Apply the autothreshold strategy

# NOTE: input
d = fread("./tmp/table_no_umi_auto_threshold_predict_rate_clustering.tsv")

d_plot_pr = melt(d, id.vars = "simu_name", measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value") 
## Warning in melt.data.table(d, id.vars = "simu_name", measure.vars = c("pre", : 'measure.vars' [pre, rec, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too.
## Check DETAILS in ?melt.data.table for more on coercion.
d_plot_tf = melt(d, id.vars = "simu_name", measure.vars=c("tpr", "fpr"), variable.name = "pr", value.name = "value") 
## Warning in melt.data.table(d, id.vars = "simu_name", measure.vars = c("tpr", : 'measure.vars' [tpr, fpr, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too.
## Check DETAILS in ?melt.data.table for more on coercion.
ggplot(d_plot_pr) + aes(x = simu_name, y = value, fill = factor(pr)) + 
    geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_pr) + aes(x = simu_name, y = value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_tf) + aes(x = simu_name, y = value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

Apply the threshold of 0.0001 of left reads

# NOTE: input
d = fread("./tmp/table_no_umi_p0001_threshold_predict_rate_clustering.tsv")

d_plot_pr = melt(d, id.vars = "simu_name", measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value") 
## Warning in melt.data.table(d, id.vars = "simu_name", measure.vars = c("pre", : 'measure.vars' [pre, rec, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too.
## Check DETAILS in ?melt.data.table for more on coercion.
d_plot_tf = melt(d, id.vars = "simu_name", measure.vars=c("tpr", "fpr"), variable.name = "pr", value.name = "value") 
## Warning in melt.data.table(d, id.vars = "simu_name", measure.vars = c("tpr", : 'measure.vars' [tpr, fpr, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too.
## Check DETAILS in ?melt.data.table for more on coercion.
ggplot(d_plot_pr) + aes(x = simu_name, y = value, fill = factor(pr)) + 
    geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_pr) + aes(x = simu_name, y = value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_tf) + aes(x = simu_name, y = value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

Merge threshold

d1 = fread("./tmp/table_no_umi_p0001_threshold_predict_rate_clustering.tsv")[, resolution := "res0.0001"]
d2 = fread("./tmp/table_no_umi_p00001_threshold_predict_rate_clustering.tsv")[, resolution := "res0.00001"]

d = rbindlist(list(d1, d2))
d = melt(d, id.vars = c("simu_name", "resolution"), measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value") 
d$resolution %<>% factor(levels = c("res0.0001", "res0.00001"))

ggplot(d) + aes(x = simu_name, y = value, color = resolution) +
    geom_boxplot() + 
    theme0 + facet_grid(pr ~ .) + scale_color_npg() +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 240 rows containing missing values (`stat_boxplot()`).